From basics to more sophisticated commands for data visualisation
In [1]:
import numpy as np
import xlrd #With pandas
import matplotlib.pyplot as plt
import pandas as pd
In [1]:
# Exponentiation: 4 raised to the power 4
print(pow(4, 4))
In [1]:
# Basic types and explicit conversion
mInt, mFloat, mString = 6, .4, "Hey"
# str() turns the float into its text representation
mConversion = str(mFloat)
print(mInt, mFloat, mString, mConversion, type(mConversion))
In [9]:
# A list may mix a nested list with plain strings
a, b = "is", "nice"
my_list = [
    ["my", "nested", "list"],
    a,
    b,
]
print(my_list)
In [14]:
# Index -1 addresses the last element, which here is also index 2
print(my_list[-1], " == ", my_list[2])
In [17]:
# Slicing: list[start:stop], where the stop index is excluded
x = list("0123")
print(x[1:3])  # elements at index 1 and 2
print(x[:2])   # elements from the beginning up to (not including) index 2
In [30]:
# List of lists: the first index selects the sub-list,
# the second index (or slice) is applied inside that sub-list
x = [list("abc"), list("def"), list("ghi")]
print(x[2][0])
print(x[2][:2])
In [31]:
# Concatenating with a one-element list whose element is itself a list
# adds that list as a single new sub-list (hence the double brackets).
x = x + [list("jkl")]
print(x)
In [32]:
# Alias versus copy
X_pointer = x  # second name bound to the very same list object
y = x[:]       # new outer list holding the same elements
del x[1]       # removes the element from x (and so from X_pointer) only
print(y)       # y still contains the removed element
In [34]:
z = [11.25, 18.0, 20.0]
# Build full_sorted as a descending-order copy; z itself is left untouched
full_sorted = list(z)
full_sorted.sort(reverse=True)
print(full_sorted)
In [39]:
# Position of the first occurrence of 20.0 in z
print(z.index(20.0))
# Number of times 18.0 occurs in z
print(z.count(18.0))
In [16]:
# house: list of [room name, area] pairs
house = [
    ["hallway", 11.25],
    ["kitchen", 18.0],
    ["living room", 20.0],
    ["bedroom", 10.75],
    ["bathroom", 9.50],
]
# Unpack each pair while looping. Descriptive loop names are used instead of
# `x, y` so the loop does not overwrite the notebook-level x and y variables.
for room, area in house:
    print("the " + str(room) + " is " + str(area) + " sqm")
In [ ]:
# enumerate() yields (index, value) pairs, giving access to the position
areas = [11.25, 18.0, 20.0, 10.75, 9.50]
for index, area in enumerate(areas):
    print("room " + str(index) + ": " + str(area))
In [3]:
# Import the numpy package as np
import numpy as np
In [21]:
# Plain Python list ...
baseball = [180, 215, 210, 210, 188, 176, 209, 200]
# ... and the NumPy array built from it
np_baseball = np.asarray(baseball)
print(baseball)
print(np_baseball)
In [6]:
# Arithmetic on an array is applied to every element
print(10 * np_baseball)
In [2]:
# Conditional selection: the comparison produces a boolean array,
# and indexing with it keeps only the elements where it is True
print(np_baseball[np_baseball < 200])
# The same kind of boolean array can therefore be built from one array
# and used to select values in a second one
# Compare two arrays element by element
my_house = np.array([18.0, 20.0, 10.75, 9.50])
your_house = np.array([14.0, 24.0, 14.25, 9.0])
print(my_house < your_house)
# True where my_house is greater than 18.5 or smaller than 10
print((my_house > 18.5) | (my_house < 10))
In [ ]:
#Load a comma-separated text file into a NumPy array
# NOTE(review): `file` is not assigned anywhere above this cell in the visible
# notebook (it is first assigned in the later pandas cells), so on a fresh
# kernel this call raises NameError. Confirm which path was intended.
digits = np.loadtxt(file,
delimiter=',',
skiprows=1 #If the first row is a header
)
# NOTE(review): `data_float` in the commented-out line below is not defined in the visible notebook.
#plt.scatter(data_float[:, 0], data_float[:, 1]) # first column: index, second column: values
In [10]:
# names=True reads the field names from the first row of the file;
# dtype=None asks NumPy to determine the type of each column itself.
data = np.genfromtxt(
    'titanic.csv',
    delimiter=',',
    names=True,
    dtype=None,
)
In [50]:
# baseball: list of two-element rows
baseball = [
    [180, 78.4],
    [215, 102.7],
    [210, 98.5],
    [188, 75.2],
]
# 2D NumPy array built from the list of lists
np_baseball = np.array(baseball)
# shape is (number of rows, number of columns)
print(np_baseball.shape)
In [54]:
# Fourth row (index 3), all columns
print(np_baseball[3, :])
# Second column (index 1), all rows
print(np_baseball[:, 1])
In [55]:
# Element-wise product with a two-element array: the first factor is applied
# to the first column of every row, the second factor to the second column
conversion = np.array([10, 1000])
print(np_baseball * conversion)
In [57]:
# Mean of the first column (labelled height in this notebook)
print(np_baseball[:, 0].mean())
# Median of the second column (labelled weight in this notebook)
print(np.median(np_baseball[:, 1]))
In [58]:
# Correlation matrix of the two columns; rowvar=False tells NumPy that each
# column (not each row) is one variable
print(np.corrcoef(np_baseball, rowvar=False))
In [26]:
# Looping over a 1D array yields its elements one by one
np_height = np.array([180, 215, 210, 210, 188, 176, 209, 200])
for height in np_height:
    print(str(height) + " inches")

# Build a 2D NumPy array from the list of lists
baseball = [
    [180, 78.4],
    [215, 102.7],
    [210, 98.5],
    [188, 75.2],
]
np_baseball = np.array(baseball)
# np.nditer visits every single element of the 2D array, one per iteration
for value in np.nditer(np_baseball):
    print(value)
In [3]:
# Fix the seed so the draw below is the same on every run
np.random.seed(123)
# Simulate a die: randint's upper bound is exclusive, so this draws from 1..6
print(np.random.randint(low=1, high=7))
In [4]:
import matplotlib.pyplot as plt
import numpy as np
# x: the consecutive years 1993..2016 (the stop value of arange is excluded)
x = np.arange(1993, 2017)
# y: one value per year, multiplied by 1E15
# NOTE(review): the 1E15 factor presumes the raw figures are expressed in
# units of 1E15 FLOP/s; their source is not shown here, so confirm the unit.
y = 1E15 * np.array([
    0.10000000000000001, 0.40000000000000002, 1.0999999999999999,
    2.4000000000000008, 7.9999999999999876, 14.499999999999979,
    30.700000000000152, 64.600000000000421, 109.19999999999807,
    218.9999999999977, 371.79999999999768, 811.90000000000293,
    1695.6000000000022, 2789.7999999999979, 4949.3999999999842,
    12152.599999999993, 22639.799999999974, 32434.200000000077,
    58928.000000000015, 123415.90000000002, 223653.8000000001,
    273763.70000000042, 362651.79999999981, 567354.00000000012,
])
In [5]:
# Line plot
plt.plot(x, y)
plt.show()

# Scatter plot (one marker per point)
plt.scatter(x, y)
plt.show()

# Histogram of y split into 20 bins
plt.hist(y, bins=20)
plt.show()
In [19]:
# Colour lookup intended for the scatter `c=` argument.
# It is not used yet: see the TODO inside the scatter call.
col = {
    '1993': 'red',
    'Europe': 'green',
    '2014': 'blue',
    '2015': 'yellow',
    'Oceania': 'black'
}

plt.scatter(x, y,
            alpha=.8,             # opacity
            s=(x - 1990) ** 2     # bubble size: may be an array (not meaningful in this example)
            # TODO: c=col         # colour each bubble depending on its value
            )
plt.yscale('log')  # logarithmic scale on y
plt.xlabel("Years")
plt.ylabel("Computing power FLOP/S")
plt.title("Analysis of Top500 performance \n(Sum of the 500 supercomputer's performance)")
plt.grid(True)

# Free-text annotation placed at data coordinates
plt.text(2005, 1E17, 'These bubbles have no sense!')

# Arrow annotation: texte* = where the label is written,
# fleche* = the point the arrow head points at (both in data coordinates)
texteX1 = 2000
texteY1 = 1E15
flecheX1 = 2000
flecheY1 = 1E16*4
plt.annotate('Nice bubble here',
             xy=(flecheX1, flecheY1), xycoords='data',
             xytext=(texteX1, texteY1), textcoords='data',
             arrowprops=dict(arrowstyle="->",
                             linewidth=5.,
                             color='red'),
             )

# Keep the automatic axis limits except for the lower x bound, forced to 1990
x1, x2, y1, y2 = plt.axis()
plt.axis((1990, x2, y1, y2))

# Custom y ticks. 1E12 is tera (giga would be 1E9), so the first label
# is 'TeraFlops'; it was previously mislabelled 'GigaFlops'.
tick_val = [1E12, 1E15, 1E18, 1E21]
tick_lab = ['TeraFlops', 'PetaFlops', 'ExaFlops', 'ZettaFlops']
plt.yticks(tick_val, tick_lab)
plt.show()
In [18]:
# Country -> capital mapping
europe = {
    'spain': 'madrid',
    'france': 'paris',
    'germany': 'berlin',
    'norway': 'oslo',
}
# Add italy
europe.update(italy='rome')
# Remove germany
europe.pop('germany')
# All keys currently in the dictionary
print(europe.keys())
# Value stored under the key 'norway'
print(europe['norway'])
In [20]:
# items() yields (key, value) pairs, unpacked here on each iteration
for key, value in europe.items():
    print("the capital of " + key + " is " + str(value))
In [17]:
# Dictionary whose values are themselves dictionaries
europe = dict(
    spain=dict(capital='madrid', population=46.77),
    france=dict(capital='paris', population=66.03),
    germany=dict(capital='berlin', population=80.62),
    norway=dict(capital='oslo', population=5.084),
)
# Store a new inner dictionary under the key 'italy'
data = dict(capital='rome', population=59.83)
europe['italy'] = data
print(europe)
pandas makes it possible to perform more complex operations on data arrays.
In [2]:
import pandas as pd
file = "cars.csv"

# Plain read; index_col=0 uses the first column as the row labels
cars = pd.read_csv(file, index_col=0)

# Read with clean-up options
data = pd.read_csv(
    file,
    sep='\t',               # tab-separated fields
    comment='#',            # character that starts a comment in the data file
    na_values=['Nothing'],  # values to be read as NaN
)

# First rows of the frame (head() returns 5 rows by default)
print(cars.head())
In [4]:
# Spreadsheet to open
file = "battledeath.xlsx"

# Open the workbook and list the sheets it contains
xl = pd.ExcelFile(file)
print(xl.sheet_names)

# A sheet can be parsed into a DataFrame either way:
df1 = xl.parse('2002')  # selected by sheet name
df2 = xl.parse(0)       # selected by sheet position
In [3]:
# Single brackets -> the country column as a pandas Series
print(cars['country'], "\n")
# Double brackets -> the country column as a one-column DataFrame
print(cars[['country']])
# Several columns at once: country and drives_right
print(cars[['country', 'drives_right']])
# First 3 rows
print(cars.iloc[:3])
# First 3 rows again, rendered without the index
print(cars.head(3).to_string(index=False))
In [28]:
SEP = "\n --- --- ---"

# Whole frame, for reference
print(cars, SEP)
# loc selects by label: loc[rows, columns]
# Rows labelled AUS and EG
print(cars.loc[['AUS', 'EG']], SEP)
# One cell: row MOR, column drives_right
print(cars.loc['MOR', 'drives_right'], SEP)
# Sub-frame: rows RU and MOR, columns country and drives_right
print(cars.loc[['RU', 'MOR'], ['country', 'drives_right']], SEP)
# iloc selects by integer position: all rows, column at position 1
# (i.e. the second column)
print(cars.iloc[:, 1], SEP)
In [15]:
# One cell: drives_right for the row labelled MOR
print(cars.loc['MOR', 'drives_right'])
# Sub-DataFrame: rows RU and MOR, columns country and drives_right
print(cars.loc[['RU', 'MOR'], ['country', 'drives_right']])
# One whole column (as a Series)
print(cars.loc[:, 'drives_right'])
# Two whole columns (as a DataFrame)
print(cars.loc[:, ['cars_per_cap', 'drives_right']])
In [27]:
# medium: rows whose cars_per_cap lies strictly between 100 and 500
cpc = cars['cars_per_cap']
# Combine the two element-wise conditions into one boolean Series
between = (cpc > 100) & (cpc < 500)
# Indexing the frame with the boolean Series keeps only the True rows
medium = cars[between]
print(medium)
In [36]:
# Iterate over the rows of cars: iterrows() yields (row label, row Series)
for lab, row in cars.iterrows():
    print("---->" + str(lab))
    # Add a column one row at a time, reading the value from the row
    # already provided by iterrows() instead of looking it up again.
    cars.loc[lab, "COUNTRY"] = row["country"].upper()

# More efficient: build the whole column at once with the vectorised
# .str accessor, which also tolerates missing values.
cars["smallC"] = cars["country"].str.lower()
print(cars)
In [20]:
# Two plots on the same Axes: the first call returns the Axes it drew on,
# and passing it as ax= makes the second call draw there as well
pp = cars.plot.bar(x='country', y='cars_per_cap')
cars.plot(x='country', y='cars_per_cap', ax=pp)
plt.show()
In [16]:
# Read & print the first 3 lines
# The handle is named csv_file (not `file`) so that it does not overwrite the
# `file` path string assigned by other cells of this notebook.
with open('cars.csv') as csv_file:
    for line_number in range(3):
        # readline() keeps the trailing newline, so print() leaves a blank line
        print(csv_file.readline())
In [ ]:
# pickle was used here without ever being imported in this notebook
import pickle

# SECURITY: unpickling can execute arbitrary code; only load files you trust.
# Mode 'rb' because a pickle file is a byte stream, not human-readable text.
# The handle is named pkl_file (not `file`) so that it does not overwrite the
# `file` path string assigned by other cells of this notebook.
with open('data.pkl', 'rb') as pkl_file:
    d = pickle.load(pkl_file)